Center for Skeletal Research Data Analysis Services. Authored by Catherine M. May, Boston Children’s Hospital.

Parts of this script were adapted from the Harvard Chan Bioinformatic Core and Bioconductor. https://hbctraining.github.io/DGE_workshop_salmon_online/schedule/. As well as Lorena Pantano Harvard TH Chan School of Public Health. https://bioconductor.org/packages/release/bioc/vignettes/DEGreport/inst/doc/DEGreport.html.

# Make sure you have Bioconductor installed
# if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
# BiocManager::install(version = "3.11")
# BiocManager::install(c("DESeq2","AnnotationDbi","tximport","tximportData","pheatmap","clusterProfiler","DEGreport","GSEABase","DOSE","rhdf5","org.Hs.eg.db","org.Mm.eg.db","EnsDb.Hsapiens.v86","EnsDb.Mmusculus.v79","AnnotationDbi","AnnotationHub","AnnotationFilter","ensembldb","annotables","TxDb.Hsapiens.UCSC.hg19.knownGene","TxDb.Mmusculus.UCSC.mm9.knownGene"))

# You need to load the libraries every session.
library(devtools)
library(BiocManager)
library(tximport)
library(tximportData)
library(ggpubr)
library(ggpubr)
library(ggplot2)
library(ggfortify)
library(RColorBrewer)
library(pheatmap)
library(clusterProfiler)
library(DEGreport)
library(GSEABase)
library(DOSE)
library(rhdf5)
library(stringr)
library(tidyr)
library(pathview)
library(org.Mm.eg.db)
library(EnsDb.Mmusculus.v79)
library(AnnotationDbi)
library(AnnotationHub)
library(AnnotationFilter)
library(ggrepel)
library(ensembldb)
library(cowplot)
library(annotables)
library(clusterProfiler)
library(DESeq2)
library(TxDb.Mmusculus.UCSC.mm9.knownGene)
library(tidyverse)  #includes both library(dplyr) library(tibble)   

# Remove all environmental variables every time. 
# Sometimes variables left over from previous sessions can cause errors.
rm(list=ls())

# Set your working directory. Put all data in this folder.
setwd("~/Desktop/rstudio/CSR/osteocalcin")

# For reproducibility and methods it is recommended you record your R environment.
# methods <- session_info()
# writeLines(capture.output(sessionInfo()), "sessionInfo.txt")

(1) Data Import

Import the Kallisto counts files, the metadata file and the tx2gene annotation file.

Load the annotation table

For Grcm38 using the EnsDB. You can build your own custom annotation table using the instrutions at: https://hbctraining.github.io/DGE_workshop_salmon/lessons/AnnotationDbi_lesson.html.

#tx2gene   <- read.delim("tx2gene_grch38_ens94.txt")                   # Human
tx2gene   <- read.delim("tx2gene_grcm38_v79_wtransgenes_4Aug20.txt")   # Mouse with transgenes added in

Load counts data from Folders

Automatic file loading requires you structure your folder hierarchy in a specific way (ie. ~/Desktop/rstudio/FC04020/data/S8.kallisto/abundance.tsv). Folders ending in the .kallisto suffix will be searched, for the abundance.tsv files which will be loaded into the variable ‘files.’

dir          <- ("~/Desktop/rstudio/CSR/osteocalcin/")
samples_all  <- list.files(path = "./data", full.names = T, pattern="kallisto") # Load all data

samples_ko   <- list.files(path = "./data", full.names = T, pattern="KO")       # Load KO
samples_wt   <- list.files(path = "./data", full.names = T, pattern="WT")       # Load WT
samples_uk   <- list.files(path = "./data", full.names = T, pattern="UK")       # Load UK

# Set up your groups for later DEG contrasts
samples_ko_wt    <- append(samples_ko,samples_wt)                               # Merge two datasets
samples_uk_wt    <- append(samples_uk,samples_wt)   
samples_tissue   <- samples_all

# This gets the file path for each sample
files_all     <- file.path(samples_all,    "abundance.tsv")
files_ko_wt   <- file.path(samples_ko_wt,  "abundance.tsv")
files_uk_wt   <- file.path(samples_uk_wt,  "abundance.tsv")
files_tissue  <- file.path(samples_tissue, "abundance.tsv")

# This will strip off the file path info, leaving only the names of the files
# Note: the %>% is a pipe that passes information from one argument to another
names(files_all)     <- str_replace(samples_all,    "./data/", "") %>% str_replace(".kallisto", "")  
names(files_ko_wt)   <- str_replace(samples_ko_wt,  "./data/", "") %>% str_replace(".kallisto", "")
names(files_uk_wt)   <- str_replace(samples_uk_wt,  "./data/", "") %>% str_replace(".kallisto", "")  
names(files_tissue)  <- str_replace(samples_tissue, "./data/", "") %>% str_replace(".kallisto", "")  

# This checks that for each sample folder there is an abundance.tsv file
all(file.exists(files_all))
all(file.exists(files_ko_wt))
all(file.exists(files_uk_wt))
all(file.exists(files_tissue))

# If you want to find out what a function does, type
#  ?function
#  ie ?str_replace

TxImport

Imports transcript-level abundance, estimate counts and transcript lengths, and generate a matrix

# Ignore transcript version if needed ie. ENSMUSG00000000300.1, ENSMUSG00000000300.2 should not be counted as separate transcripts just because they are different versions. 
# add this inside the tximport function if needed: ignoreAfterBar = TRUE, ignoreTxVersion = TRUE, txOut=F
txi_all     <- tximport(files_all,    type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)
txi_ko_wt   <- tximport(files_ko_wt,  type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)
txi_uk_wt   <- tximport(files_uk_wt,  type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)
txi_tissue  <- tximport(files_tissue, type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)

# Check your imported data for issues
# head(txi_all$counts)                      # Check the first ten lines
# tail(txi_all$counts)                      # Check the last ten lines  
# dim(txi_all$counts)                       # Check the dimensions
# colnames(txi_all$counts)                  # Check the column names
# summary(txi_all$counts)                   # Get a summary of the data
# class(tx2gene)                            # Check the class

# Create a dataframe with the counts
data    <-  txi_all$counts   %>%  round()  %>%  data.frame()

# Save the raw data
write.csv(data, file="~/Desktop/rstudio/CSR/osteocalcin/results/Raw_counts.csv", sep="\t", quote=F, col.names=NA)

Import Metadata

# Read in metadata from a .csv file
meta   <-  read.csv("~/Desktop/rstudio/CSR/osteocalcin/osteocalcin_metadata_4Aug20.csv", stringsAsFactors=T)  

# Convert from integer to factor to improve plotting
# meta$replicate <-  as.factor(meta$replicate)
# meta$library   <-  as.factor(meta$library)
# meta$tissue    <-  as.factor(meta$tissue)

meta_all    <-  meta[c(1:14),]      
meta_ko_wt  <-  meta[c(1:6,11:14),] 
meta_uk_wt  <-  meta[c(7:10,11:14),]    
meta_tissue <-  meta[c(1:14),]    

# # Check to see if a factor is integer or factor
# class(meta$replicate)
# class(meta$library)

(2) Data Assessment

Plot the distribution of the data and check for any potential issues.

Average replicates

In case you want to look at averaged data for the replicates. Not recommended for the DEG, as DESeq models will already account for replicates.

# Show(txi_avg$counts)        #To find the column numbers
# Average columns manually
avg_wt   <-  ((txi_all$counts[,2] + txi_all$counts[,6] + txi_all$counts[,7] + txi_all$counts[,8]) / 4)
avg_ko   <-  ((txi_all$counts[,1] + txi_all$counts[,3] + txi_all$counts[,4] + txi_all$counts[,5]  + txi_all$counts[,9] + txi_all$counts[,10]) / 6)

# Step 3: Create a new dataframe with the averaged columns
data_avg  <-  cbind(avg_wt, avg_ko)
#Show(data_avg)

Histograms of Raw Counts

Check for outlier samples

avg1_plot  <- ggplot(data_avg) + geom_histogram(aes(x = avg_wt), stat = "bin", bins = 200) + xlim(0,20000) + ylim(0,2000) +
                     xlab("Raw expression counts") + ylab("Number of genes") + ggtitle("WT")  
avg2_plot  <- ggplot(data_avg) + geom_histogram(aes(x = avg_ko), stat = "bin", bins = 200) + xlim(0,20000) + ylim(0,2000) +
                     xlab("Raw expression counts") + ylab("Number of genes")  + ggtitle("KO")

Data Distribution

This will help determine what model distribution to use. Most RNA-seq data falls along a Negative Binomial (NB) distribution.

Calculate the mean and variance

To use a poisson distribution, it assumes that mean equals variance. Plot counts mean vs. variance to check if assumption holds.

# Select data to use and find mean and var
mean_counts_wt       <-  apply(data[,c(11:14)], 1, mean)      
variance_counts_wt   <-  apply(data[,c(11:14)], 1, var)

mean_counts_ko       <-  apply(data[,c(1:6)], 1, mean) 
variance_counts_ko   <-  apply(data[,c(1:6)], 1, var) 

# Create a new dataframe with mean and variance
df_wt   <-  data.frame(mean_counts_wt, variance_counts_wt)
df_ko   <-  data.frame(mean_counts_ko, variance_counts_ko)

Plot Mean vs. Variance

The NB distribution assumes that the mean is not equal to variance. If the scatter falls on the red line, then you may have to use a different distribution. Normally the scatter falls to the left of the line (ie low mean counts have higher variance)

wt_plot     <- ggplot(df_wt) + geom_point(aes(x=mean_counts_wt, 
                     y=variance_counts_wt)) + ggtitle("WT Counts") + 
                     geom_abline(intercept = 0, slope = 1, color="red") +
                     scale_y_log10(limits = c(500,1e9)) + scale_x_log10(limits = c(100,1e9))  
              
ko_plot     <- ggplot(df_ko) + geom_point(aes(x=mean_counts_ko, 
                     y=variance_counts_ko)) + ggtitle("KO Counts") + 
                     geom_abline(intercept = 0, slope = 1, color="red") +
                     scale_y_log10(limits = c(500,1e9)) + scale_x_log10(limits = c(100,1e9)) 

Match metadata and counts matrix

Ensure that the counts labels and metadata match.

meta_names_all       <-  factor(c("KO_S10_4020","KO_S4_4071","KO_S5_4071","KO_S6_4071","KO_S9_4020", "KO_S9_4071",
                                  "UK_S4_4000","UK_S5_4000","UK_S6_4000","UK_S7_4000",
                                  "WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))
meta_names_ko_wt     <-  factor(c("KO_S10_4020","KO_S4_4071","KO_S5_4071","KO_S6_4071","KO_S9_4020", "KO_S9_4071",
                                  "WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))
meta_names_uk_wt     <-  factor(c("UK_S4_4000","UK_S5_4000","UK_S6_4000","UK_S7_4000",
                                  "WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))
meta_names_tissue    <-  factor(c("KO_S10_4020","KO_S4_4071","KO_S5_4071","KO_S6_4071","KO_S9_4020", "KO_S9_4071",
                                  "UK_S4_4000","UK_S5_4000","UK_S6_4000","UK_S7_4000",
                                  "WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))

rownames(meta_all)     <-  meta_names_all
rownames(meta_ko_wt)   <-  meta_names_ko_wt
rownames(meta_uk_wt)   <-  meta_names_uk_wt
rownames(meta_tissue)  <-  meta_names_tissue

# Does data in x match data in y?
print("Does data in txi counts match data in metadata?")
all(colnames(txi_all$counts)     %in%  rownames(meta_all))
all(colnames(txi_ko_wt$counts)   %in%  rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts)   %in%  rownames(meta_uk_wt))
all(colnames(txi_tissue$counts)  %in%  rownames(meta_tissue))

# Is data in the same order in x as y?
print("Are txi counts in the same order as in metadata?")
all(colnames(txi_all$counts)     ==  rownames(meta_all))
all(colnames(txi_ko_wt$counts)   ==  rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts)   ==  rownames(meta_uk_wt))
all(colnames(txi_tissue$counts)  ==  rownames(meta_tissue))

Reorder samples

Samples in dataframe to match sample name order in metadata

# Check sample name order in both the dataframe and metadata file
colnames(data)

# If they don't match, then use the match function to re-arrange
# First find index numbers
idx_all     <-  match(rownames(meta_all), colnames(txi_all$counts))      
idx_ko_wt   <-  match(rownames(meta_ko_wt), colnames(txi_ko_wt$counts))    
idx_uk_wt   <-  match(rownames(meta_uk_wt), colnames(txi_uk_wt$counts))      
idx_tissue  <-  match(rownames(meta_tissue), colnames(txi_tissue$counts))      

# Use this to re-order
txi_all$counts     <-  txi_all$counts[, idx_all]           
txi_ko_wt$counts   <-  txi_ko_wt$counts[, idx_ko_wt] 
txi_uk_wt$counts   <-  txi_uk_wt$counts[, idx_uk_wt]           
txi_tissue$counts  <-  txi_tissue$counts[, idx_tissue]           

# Check that you re-ordered the data correctly.  
# Mismatches will cause errors in the downstream analysis.
print("Do column names match now between txi counts and metadata?")
all(colnames(txi_all$counts)     %in%  rownames(meta_all))
all(colnames(txi_ko_wt$counts)   %in%  rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts)   %in%  rownames(meta_uk_wt))
all(colnames(txi_tissue$counts)  %in%  rownames(meta_tissue))

print("Are column names now in the same order in txi counts and metadata?")
all(colnames(txi_all$counts)     ==  rownames(meta_all))
all(colnames(txi_ko_wt$counts)   ==  rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts)   ==  rownames(meta_uk_wt))
all(colnames(txi_tissue$counts)  ==  rownames(meta_tissue))

(3) Normalization

Create DESEq2 object

Most of the analyses will rely on accessing or recording information to the DESeq2 object

# I made a separate object for each comparison
dds_all     <-  DESeqDataSetFromTximport(txi_all,    colData = meta_all,    design = ~ library)      
dds_ko_wt   <-  DESeqDataSetFromTximport(txi_ko_wt,  colData = meta_ko_wt,  design = ~ genotype)    
dds_uk_wt   <-  DESeqDataSetFromTximport(txi_uk_wt,  colData = meta_uk_wt,  design = ~ genotype)      
dds_tissue  <-  DESeqDataSetFromTximport(txi_tissue, colData = meta_tissue, design = ~ tissue)      

# To build a more complex model to include other variables see below or google "design formula"
# The formula below can be interpreted as "counts explained by tissue source + batch + cell type)
# dds_complex <- DESeqDataSetFromTximport(txi, colData=meta, design = ~ tissue + batch + celltype)   

# Check to make sure the objects were created correctly
# Show(counts(dds_tissue))
# dds_tissue@colData@rownames

Normalize Count Data

DESeq2 uses Median of Ratios as a normalization method

# Generate size factors for the median of ratios method of normalization
dds_all     <- estimateSizeFactors(dds_all)
dds_ko_wt   <- estimateSizeFactors(dds_ko_wt)
dds_uk_wt   <- estimateSizeFactors(dds_uk_wt)
dds_tissue  <- estimateSizeFactors(dds_tissue)

# Fill the slots of the DESeqDataSet object with the appropriate information
sizeFactors(dds_all)
sizeFactors(dds_ko_wt)
sizeFactors(dds_uk_wt)
sizeFactors(dds_tissue)

# Retrieve the normalized counts matrix from dds 
norm_counts_all     <- counts(dds_all,    normalized=TRUE)
norm_counts_ko_wt   <- counts(dds_ko_wt,  normalized=TRUE)
norm_counts_uk_wt   <- counts(dds_uk_wt,  normalized=TRUE)
norm_counts_tissue  <- counts(dds_tissue, normalized=TRUE)

Tranform counts

rlog is the standard

# rlog() function returns a DESeqTransform object, used for the PCA and hierarchical clustering
# blind=true means that the transformation in unbiased, not taking sample into consideration
rld_all      <-  rlog(dds_all,    blind=TRUE)  
rld_ko_wt    <-  rlog(dds_ko_wt,  blind=TRUE)  
rld_uk_wt    <-  rlog(dds_uk_wt,  blind=TRUE)   
rld_tissue   <-  rlog(dds_tissue, blind=TRUE)   

(4) PCA Plots

Principle Component Analysis by sample groups. Data performed on normalized r-log transformed data.

Genotype PCA (WT vs KO)

# If you use the <- to assign a plot to a variable
# You can "call" the plot by the name to visualize or save it
plot1a  <-  plotPCA(rld_ko_wt, intgroup="genotype")
plot1a  <-  plot1a + ggtitle("Genotype (WT vs KO) PCA")
plot1a

Library PCA

rld_tissue$tissue <-  as.factor(rld_tissue$tissue)
plot1b2 <- plotPCA(rld_tissue, intgroup="tissue")
plot1b2 <- plot1b2 + ggtitle("Tissue PCA")
plot1b2

Library PCA

rld_all$library <-  as.factor(rld_all$library)
plot1b <- plotPCA(rld_all, intgroup="library")
plot1b <- plot1b + ggtitle("Library PCA")
plot1b

ggsave("PCA_library.tiff", plot = plot1b, device = "tiff", scale = 1.5, dpi = 300, 
       path ="~/Desktop/rstudio/CSR/osteocalcin/results", width = 7, height = 5, units = "in")

Replicate PCA

rld_all$replicate <-  as.factor(rld_all$replicate)
plot1c <- plotPCA(rld_all, intgroup="replicate")
plot1c <- plot1c + ggtitle("Replicate PCA")
plot1c

ggsave("PCA_replicates.tiff", plot = plot1c, device = "tiff", scale = 1.5, dpi = 300, 
       path ="~/Desktop/rstudio/CSR/osteocalcin/results", width = 7, height = 5, units = "in")

Compare all Tissue PCAs

Make sure that there is not a correlation of data with confounding factors, like batch or replicate number.

(5) Heatmaps

Heatmap Correlation

Pearson Correlation is the default for the function cor. Type ?cor for more info.

# Extract the rlog matrix from the object
rld_mat_all     <- assay(rld_all)   
rld_mat_ko_wt   <- assay(rld_ko_wt)   
rld_mat_uk_wt   <- assay(rld_uk_wt)   
rld_mat_tissue  <- assay(rld_tissue)   

# Compute pairwise correlation values
rld_cor_all     <- cor(rld_mat_all)   
rld_cor_ko_wt   <- cor(rld_mat_ko_wt)   
rld_cor_uk_wt   <- cor(rld_mat_uk_wt)   
rld_cor_tissue  <- cor(rld_mat_tissue)   

# Plot heatmap using the correlation matrix and the metadata object
# To find specific columns for the legend, type colnames(meta) and select the columns.
meta1     <- meta_all[,c(3:6)]
meta2     <- meta_ko_wt[,c(2:3,5)]
meta3     <- meta_uk_wt[,c(2:3,5)]
meta4     <- meta_tissue[,c(2:4)]

#all_plot1 <- pheatmap(rld_cor_all,    annotation = meta1, cluster_rows = TRUE, main="Osteocalcin Heatmap (KO vs WT)", fontsize = 6) 
all_plot2 <- pheatmap(rld_cor_ko_wt,  annotation = meta2, cluster_rows = TRUE, main="Osteocalcin Heatmap (KO vs WT)", fontsize = 6) 

all_plot3 <- pheatmap(rld_cor_uk_wt,  annotation = meta3, cluster_rows = TRUE, main="Osteocalcin Heatmap (UK vs WT)", fontsize = 6) 

all_plot4 <- pheatmap(rld_cor_tissue, annotation = meta4, cluster_rows = TRUE, main="Osteocalcin Heatmap (Bone vs Organ)", fontsize = 6) 

(6) Differential Gene Expression

The primary comparison is WT vs. KO.

Design formulas

Specify what factors you want to look at.

# '~ treatment' is interpreted as "read counts explained by treatment"
design(dds_all)      <-  formula(~ library)         
design(dds_ko_wt)    <-  formula(~ genotype)       
design(dds_uk_wt)    <-  formula(~ genotype)         
design(dds_tissue)   <-  formula(~ tissue)         

# Differential expression analysis based on the Negative Binomial distribution
# type ?DESeq for more info
dds_all     <- DESeq(dds_all)
dds_ko_wt   <- DESeq(dds_ko_wt)
dds_uk_wt   <- DESeq(dds_uk_wt)
dds_tissue  <- DESeq(dds_tissue)

res_all     <- results(dds_all)
res_ko_wt   <- results(dds_ko_wt)
res_uk_wt   <- results(dds_uk_wt)
res_tissue  <- results(dds_tissue)

Wald Test for Significance

By default DESeq2 uses the Wald test to identify genes that are differentially expressed between two sample classes. Use with 2 groups of comparison. For 3+ groups use LRT.

# Example: contrast <- c("sampletype", "overexpression", "control") 
# Set up you contrasts. Base level (control) goes last
# Deciding the base level will determine how to interpret the fold change

# Compare WT vs KO
contrast_ko_wt   <-  c("genotype", "KO",    "WT")  
contrast_uk_wt   <-  c("genotype", "UK",    "WT")  
contrast_tissue  <-  c("tissue",   "organ", "bone")  

results(dds_ko_wt,  contrast = contrast_ko_wt)
results(dds_uk_wt,  contrast = contrast_uk_wt)
results(dds_tissue, contrast = contrast_tissue)

# Extract results
res_ko_wt   <-  results(dds_ko_wt,  contrast = contrast_ko_wt,  alpha = 0.15, lfcThreshold = 0.1)     ##############   THIS SHOULD BE 0.05 OR LESS   ##############
res_uk_wt   <-  results(dds_uk_wt,  contrast = contrast_uk_wt,  alpha = 0.05, lfcThreshold = 0.1)     ##############   THIS SHOULD BE 0.05 OR LESS   ##############
res_tissue  <-  results(dds_tissue, contrast = contrast_tissue, alpha = 0.05, lfcThreshold = 0.1)     ##############   THIS SHOULD BE 0.05 OR LESS   ##############

# The results table that is returned to us is a DESeqResults object
# class(res_tissue)

# Check what is stored in the results
# res_tissue  %>%  data.frame()  %>%  Show()

Gene-level Filtering

This is important because you don’t want to include zero expression genes in your analysis as it will greatly decrease significance via multiple testing and the FDR/BH correction.

# Filter genes with zero expression in all samples
res_all[which(res_all$baseMean == 0),]        %>%  data.frame() 
res_ko_wt[which(res_ko_wt$baseMean == 0),]    %>%  data.frame() 
res_uk_wt[which(res_uk_wt$baseMean == 0),]    %>%  data.frame() 
res_tissue[which(res_tissue$baseMean == 0),]  %>%  data.frame() 

# Filter genes that have an extreme outlier 
# If a sample has a gene with an extreme outlier, the p-value and adjusted p-value will be set to NA.
res_all[which(is.na(res_all$pvalue)          &  is.na(res_all$padj)),]      %>%  data.frame() 
res_ko_wt[which(is.na(res_ko_wt$pvalue)      &  is.na(res_ko_wt$padj)),]    %>%  data.frame() 
res_uk_wt[which(is.na(res_uk_wt$pvalue)      &  is.na(res_uk_wt$padj)),]    %>%  data.frame() 
res_tissue[which(is.na(res_tissue$pvalue)    &  is.na(res_tissue$padj)),]   %>%  data.frame() 

# Filter genes below the low mean threshold
# Genes with very low counts are not likely to have significant differences
res_all[which(!is.na(res_all$pvalue)         &  is.na(res_all$padj)),]      %>%  data.frame() 
res_ko_wt[which(!is.na(res_ko_wt$pvalue)     &  is.na(res_ko_wt$padj)),]    %>%  data.frame() 
res_uk_wt[which(!is.na(res_uk_wt$pvalue)     &  is.na(res_uk_wt$padj)),]    %>%  data.frame() 
res_tissue[which(!is.na(res_tissue$pvalue)   &  is.na(res_tissue$padj)),]   %>%  data.frame() 

QC

In order to model counts, you need to know if your RNA-seq data meets the basic assumptions, including dispersion.

Dispersion Estimates

This fits a curve to the gene-wise dispersion estimates and plots the expected dispersion value for genes of a given expression strength. Each black dot is a gene with an associated mean expression level and maximum likelihood estimation (MLE) of the dispersion. Read more at https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/04b_DGE_DESeq2_analysis.html

# Get information on each column in results
mcols(res_all,    use.names=T)
mcols(res_ko_wt,  use.names=T)
mcols(res_uk_wt,  use.names=T)
mcols(res_tissue, use.names=T)

# Check the size factors
sizeFactors(dds_all)
sizeFactors(dds_ko_wt)
sizeFactors(dds_uk_wt)
sizeFactors(dds_tissue)

# Total number of raw counts per sample
colSums(counts(dds_all))
colSums(counts(dds_ko_wt))
colSums(counts(dds_uk_wt))
colSums(counts(dds_tissue))

# Look at the total depth after normalization
# Total number of normalized counts per sample
colSums(counts(dds_all,    normalized=T))
colSums(counts(dds_ko_wt,  normalized=T))
colSums(counts(dds_uk_wt,  normalized=T))
colSums(counts(dds_tissue, normalized=T))

# Plot dispersion estimates
# plotDispEsts(dds_all,    main="Dispersion Genotype (All)")
# plotDispEsts(dds_ko_wt,  main="Dispersion Genotype (WT vs KO)")
# plotDispEsts(dds_uk_wt,  main="Dispersion Genotype (WT vs UK)")
plotDispEsts(dds_tissue, main="Dispersion Tissue (Bone vs. Organ)")

Shrinking Estimates

Shrinkage will help to improve the accuracy of count models

# Save the unshrunken results to compare
res_ko_wt_unshrunken   <-  res_ko_wt
res_uk_wt_unshrunken   <-  res_uk_wt
res_tissue_unshrunken  <-  res_tissue

resultsNames(dds_ko_wt)
resultsNames(dds_uk_wt)
resultsNames(dds_tissue)

# Apply fold change shrinkage
res_ko_wt   <-  lfcShrink(dds_ko_wt,  contrast=contrast_ko_wt,  res=res_ko_wt,  type="normal")
res_uk_wt   <-  lfcShrink(dds_uk_wt,  contrast=contrast_uk_wt,  res=res_uk_wt,  type="normal")
res_tissue  <-  lfcShrink(dds_tissue, contrast=contrast_tissue, res=res_tissue, type="normal")

Plot Shrunken Estimates

# MA plot using unshrunken fold changes
plotMA(res_ko_wt_unshrunken,  ylim=c(-6,6))

# plotMA(res_uk_wt_unshrunken,  ylim=c(-6,6))
# plotMA(res_tissue_unshrunken, ylim=c(-6,6))

# MA plot using shrunken fold changes
plotMA(res_ko_wt,  ylim=c(-6,6))

# plotMA(res_uk_wt,  ylim=c(-6,6))
# plotMA(res_tissue, ylim=c(-6,6))

Size factor QC

As assumption of DESeq2 is that most genes will remain unchanged. Plotting the distribution of gene ratios between each gene and the average gene can show how true this is. If some samples show different distribution, there may be bias due to some biological or technical factor. Read more on degCheckFactors at https://rdrr.io/bioc/DEGreport/man/degCheckFactors.html.

degCheckFactors(counts(dds_tissue), each=TRUE)

Prepare your data for DGE

Generate a Results table ith only significant genes

# Summarize results
summary(res_all,    alpha = 0.15)     ##############   THIS SHOULD BE 0.05 OR LESS   ##############
summary(res_ko_wt,  alpha = 0.15)     ##############   THIS SHOULD BE 0.05 OR LESS   ##############
summary(res_uk_wt,  alpha = 0.05)     ##############   THIS SHOULD BE 0.05 OR LESS   ##############
summary(res_tissue, alpha = 0.05)     ##############   THIS SHOULD BE 0.05 OR LESS   ##############

# Set P adjust thresholds
padj.cutoff2   <-  0.05               ##############   THIS SHOULD BE 0.05 OR LESS   ##############
padj.cutoff3   <-  0.15               ##############   THIS SHOULD BE 0.05 OR LESS   ##############

# Create a tibble of results
# A tibble is like a dataframe; the tidyverse package requires you convert dataframes to tibbles
# To learn more about tibbles: https://r4ds.had.co.nz/tibbles.html
res_ko_wt_tb   <-  res_ko_wt   %>%  data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
res_uk_wt_tb   <-  res_uk_wt   %>%  data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
res_tissue_tb  <-  res_tissue  %>%  data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
 
# Subset the tibble to keep only significant genes
# This means we run less statistical tests and dont have to adjust FDR as much.
# ie. instead of adjusting for 10,000 multiple tests, you can just adjust for the most relevent 2,000 tests/genes.
sigGE_ko_wt   <-  res_ko_wt_tb   %>%  dplyr::filter(padj < padj.cutoff3)
sigGE_uk_wt   <-  res_uk_wt_tb   %>%  dplyr::filter(padj < padj.cutoff2)
sigGE_tissue  <-  res_tissue_tb  %>%  dplyr::filter(padj < padj.cutoff2)

# dim(sigGE_ko_wt)     # Check how mny rows/DEGs there are

Normalized Counts Visualization

DESeq2 has its own method of normalization (not TPM or FPKM), called Median of Ratios. This normalization accounts for sequencing depth and RNA composition, but not gene length. For this reason, you should NOT do within sample comparisons because gene lengths can be very different (ie. don’t compare col1a1 to col2a1 in muscle). However, between sample comparisons are fine, because Gene A is the same length between both Group 1 and Group 2. To better understand normalization read: https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/02_DGE_count_normalization.html.

# Add sampletype to the metadata
meta_all      <-  meta_all     %>%  rownames_to_column(var="samplename")  %>%  as_tibble()
meta_ko_wt    <-  meta_ko_wt   %>%  rownames_to_column(var="samplename")  %>%  as_tibble()
meta_uk_wt    <-  meta_uk_wt   %>%  rownames_to_column(var="samplename")  %>%  as_tibble()
meta_tissue   <-  meta_tissue  %>%  rownames_to_column(var="samplename")  %>%  as_tibble()

# First convert normalized_counts to a data frame and transfer the row names to a new column called "gene"
norm_counts_all     <- counts(dds_all,    normalized=T)  %>%  data.frame() %>% rownames_to_column(var="gene") 
norm_counts_ko_wt   <- counts(dds_ko_wt,  normalized=T)  %>%  data.frame() %>% rownames_to_column(var="gene") 
norm_counts_uk_wt   <- counts(dds_uk_wt,  normalized=T)  %>%  data.frame() %>% rownames_to_column(var="gene") 
norm_counts_tissue  <- counts(dds_tissue, normalized=T)  %>%  data.frame() %>% rownames_to_column(var="gene") 

# Merge ensembl IDs the normalized counts data frame with a subset of the annotations in the tx2gene data frame
mus38   <-  tx2gene  %>%  dplyr::select(ensgene, symbol)  %>%  dplyr::distinct()

# This will bring in a column of gene symbols
norm_counts_all     <- merge(norm_counts_all,    mus38, by.x="gene", by.y="ensgene")
norm_counts_ko_wt   <- merge(norm_counts_ko_wt,  mus38, by.x="gene", by.y="ensgene")
norm_counts_uk_wt   <- merge(norm_counts_uk_wt,  mus38, by.x="gene", by.y="ensgene")
norm_counts_tissue  <- merge(norm_counts_tissue, mus38, by.x="gene", by.y="ensgene")

# Now create a tibble for the normalized counts
norm_counts_all     <- norm_counts_all    %>%  as_tibble()
norm_counts_ko_wt   <- norm_counts_ko_wt  %>%  as_tibble()
norm_counts_uk_wt   <- norm_counts_uk_wt  %>%  as_tibble()
norm_counts_tissue  <- norm_counts_tissue %>%  as_tibble()

#colnames(norm_counts_all)   #Check column names

# Normalized Counts are: Median of Ratios 
norm_counts_all     <- norm_counts_all[, c(1,16,2:15)]     #re-order the columns
norm_counts_ko_wt   <- norm_counts_ko_wt[, c(1,12,2:11)]   #re-order the columns
norm_counts_uk_wt   <- norm_counts_uk_wt[, c(1,10,2:9)]    #re-order the columns
norm_counts_tissue  <- norm_counts_tissue[, c(1,16,2:15)]  #re-order the columns

(7) Single Gene Plots

Testing hyotheses about tissue=specifc and cell-specific gene expression

Plot expression of Single Genes

# Find the Ensembl ID of a particular gene.
mus38[mus38$symbol == "Col1a1", "ensgene"]

# Copy the Ensembl number and assign it to the gene name
Col1a1    <-  "ENSMUSG00000022483"
Bglap     <-  "ENSMUSG00000074483"
Tdtomato  <-  "TRANSGENES"

# Save plotcounts to a data frame object for later plotting
d0 <- plotCounts(dds_all,    gene = Col1a1, intgroup ="genotype", returnData=TRUE)
d1 <- plotCounts(dds_ko_wt,  gene = Col1a1, intgroup ="genotype", returnData=TRUE)
d2 <- plotCounts(dds_uk_wt,  gene = Col1a1, intgroup ="genotype", returnData=TRUE)
d3 <- plotCounts(dds_tissue, gene = Col1a1, intgroup ="tissue",   returnData=TRUE)

# What is the data output of plotCounts()?
# d0 %>% Show()
# Plot the  normalized counts, using the sampletypes (rownames(d) as labels)
c0_plot <- ggplot(d1, aes(x = genotype, y = count, color = genotype)) + 
           geom_point(position=position_jitter(w = 0.2,h = 0))  +
           geom_text_repel(aes(label = rownames(d1)))  + theme_bw() + 
           theme(plot.title = element_text(hjust = 0.5))  +
           ggtitle("Col1a1 Counts by Genotype (WT vs KO)")   
c1_plot <- ggplot(d3, aes(x = tissue, y = count, color = tissue)) + 
           geom_point(position=position_jitter(w = 0.2,h = 0))  +
           geom_text_repel(aes(label = rownames(d3)))  + theme_bw() + 
           theme(plot.title = element_text(hjust = 0.5))  +
           ggtitle("Col1a1 Counts by Tissue (Bone vs. Organ)")   
c0_plot

c1_plot

Candidate Gene Approach

# Note that for mouse, capitalize first letter, then lower case. 
# For human genes, all caps.
mus38[mus38$symbol == "Bglap3", "ensgene"]      

# Osteoprogenitor, osteoblast and osteocyte markers
Col1a1      <-  "ENSMUSG00000001506"
Col2a1      <-  "ENSMUSG00000022483"
Dcn         <-  "ENSMUSG00000019929"
Sp7         <-  "ENSMUSG00000060284"
Runx2       <-  "ENSMUSG00000039153"
Bglap       <-  "ENSMUSG00000074483"
Bglap3      <-  "ENSMUSG00000074489" 
Spp1        <-  "ENSMUSG00000029304"
Sparc       <-  "ENSMUSG00000018593"
Tmem119     <-  "ENSMUSG00000054675"
Dmp1        <-  "ENSMUSG00000029307"
Fgf23       <-  "ENSMUSG00000000182"
Sost        <-  "ENSMUSG00000001494"
CD200       <-  "ENSMUSG00000022661"  # ox-2
Itgb1       <-  "ENSMUSG00000025809"  # cd29
Thy1        <-  "ENSMUSG00000032011"  # cd90

# Save plotcounts to a data frame object
cplot1a     <-  plotCounts(dds_all, gene=Col1a1,  intgroup="genotype", returnData=TRUE)
cplot2a     <-  plotCounts(dds_all, gene=Col2a1,  intgroup="genotype", returnData=TRUE)
cplot3a     <-  plotCounts(dds_all, gene=Dcn,     intgroup="genotype", returnData=TRUE)
cplot4a     <-  plotCounts(dds_all, gene=Sp7,     intgroup="genotype", returnData=TRUE)
cplot5a     <-  plotCounts(dds_all, gene=Runx2,   intgroup="genotype", returnData=TRUE)
cplot6a     <-  plotCounts(dds_all, gene=Bglap,   intgroup="genotype", returnData=TRUE)
cplot7a     <-  plotCounts(dds_all, gene=Spp1,    intgroup="genotype", returnData=TRUE)
cplot8a     <-  plotCounts(dds_all, gene=Sparc,   intgroup="genotype", returnData=TRUE)
cplot9a     <-  plotCounts(dds_all, gene=Tmem119, intgroup="genotype", returnData=TRUE)
cplot10a    <-  plotCounts(dds_all, gene=Dmp1,    intgroup="genotype", returnData=TRUE)
cplot11a    <-  plotCounts(dds_all, gene=Fgf23,   intgroup="genotype", returnData=TRUE)
cplot12a    <-  plotCounts(dds_all, gene=Sost,    intgroup="genotype", returnData=TRUE)
cplot13a    <-  plotCounts(dds_all, gene=Bglap3,  intgroup="genotype", returnData=TRUE)
cplot14a    <-  plotCounts(dds_all, gene=CD200,   intgroup="genotype", returnData=TRUE)
cplot15a    <-  plotCounts(dds_all, gene=Itgb1,   intgroup="genotype", returnData=TRUE)
cplot16a    <-  plotCounts(dds_all, gene=Thy1,    intgroup="genotype", returnData=TRUE)

Top20 DEG Plot

#Set p-adjusted value cut off
padj_cutoff     <-  0.15                                                                         ##############   THIS SHOULD BE 0.05 OR LESS   ##############
padj_cutoff2    <-  0.05                                                                         ##############   THIS SHOULD BE 0.05 OR LESS   ##############

sig_res_ko_wt   <-  dplyr::filter(res_ko_wt_tb,  padj < padj_cutoff)  %>% arrange(padj)
sig_res_uk_wt   <-  dplyr::filter(res_uk_wt_tb,  padj < padj_cutoff)  %>% arrange(padj)
sig_res_tissue  <-  dplyr::filter(res_tissue_tb, padj < padj_cutoff2) %>% arrange(padj)

#dim(sig_res_ko_wt)  # Look at rows to see how many DEGs were found

## Order results by padj values
top20_ko_wt     <-  sig_res_ko_wt   %>%  arrange(padj) %>% pull(gene) %>% head(n=20)    
top20_uk_wt     <-  sig_res_uk_wt   %>%  arrange(padj) %>% pull(gene) %>% head(n=20)    
top20_tissue    <-  sig_res_tissue  %>%  arrange(padj) %>% pull(gene) %>% head(n=20)    

## Normalized counts for top 20 significant genes
top20_sig_ko_wt  <- norm_counts_ko_wt   %>%  dplyr::filter(gene %in% top20_ko_wt)
top20_sig_uk_wt  <- norm_counts_uk_wt   %>%  dplyr::filter(gene %in% top20_uk_wt)
top20_sig_tissue <- norm_counts_tissue  %>%  dplyr::filter(gene %in% top20_tissue)

# Gathering the columns to have normalized counts to a single column
gathered_top20_sig_ko_wt  <-  top20_sig_ko_wt  %>%  gather(colnames(top20_sig_ko_wt)[3:12],  key = "sample", value = "normalized_counts")
gathered_top20_sig_uk_wt  <-  top20_sig_uk_wt  %>%  gather(colnames(top20_sig_uk_wt)[3:10],  key = "sample", value = "normalized_counts")
gathered_top20_tissue     <-  top20_sig_tissue %>%  gather(colnames(top20_sig_tissue)[3:16], key = "sample", value = "normalized_counts")

# Check the column header in the "gathered" data frame
# View(gathered_top20_sig_all)

# Combine metadata with the melted normalized countsinto the same data frame for input to ggplot()
gathered_top20_sig_ko_wt   <-  inner_join(meta, gathered_top20_sig_ko_wt)
gathered_top20_sig_uk_wt   <-  inner_join(meta, gathered_top20_sig_uk_wt)
gathered_top20_tissue      <-  inner_join(meta, gathered_top20_tissue)

bk   <- ggplot(gathered_top20_sig_ko_wt) + geom_point(aes(x = symbol, y = normalized_counts, 
        color = genotype)) + xlab("Genes") + ylab("log10 Normalized Counts") +
        ggtitle("Top 20 DEGs Genotype (WT vs KO)") + theme_bw() +  scale_y_log10() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
        theme(plot.title = element_text(hjust = 0.5)) 
bt   <- ggplot(gathered_top20_tissue) + geom_point(aes(x = symbol, y = normalized_counts, 
        color = tissue)) + xlab("Genes") + ylab("log10 Normalized Counts") +
        ggtitle("Top 20 DEGs (Tissue)") + theme_bw() +  scale_y_log10() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
        theme(plot.title = element_text(hjust = 0.5)) 
bk

bt

(8) Volcano Plots

Log2FC Volcano Plot

# TRUE values denote padj values < 0.05 and fold change > 1.5 in either direction
res_ko_wt_tb    <- res_ko_wt_tb    %>%  mutate(threshold_OE=padj < 0.15 & abs(log2FoldChange) >= 0.15)      ##############   THIS SHOULD BE 0.05 OR LESS   ##############
res_uk_wt_tb    <- res_uk_wt_tb    %>%  mutate(threshold_OE=padj < 0.05 & abs(log2FoldChange) >= 0.05)      ##############   THIS SHOULD BE 0.05 OR LESS   ##############
res_tissue_tb   <- res_tissue_tb   %>%  mutate(threshold_OE=padj < 0.05 & abs(log2FoldChange) >= 0.05)      ##############   THIS SHOULD BE 0.05 OR LESS   ##############

# Log2FC Volcano plot
ggplot(res_ko_wt_tb) + geom_point(aes(x = log2FoldChange, y = -log10(padj), 
       colour = threshold_OE)) + ggtitle("Genotype (WT vs KO)") + xlab("log2 fold change") +  
       ylab("-log10 adjusted p-value") + scale_y_continuous(limits = c(0,50)) + 
       theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), 
       axis.title = element_text(size = rel(1.25)))  
ggplot(res_tissue_tb) + geom_point(aes(x = log2FoldChange, y = -log10(padj), 
       colour = threshold_OE)) + ggtitle("Tissue (Bone. vs Organ)") + xlab("log2 fold change") +  
       ylab("-log10 adjusted p-value") + scale_y_continuous(limits = c(0,50)) + 
       theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), 
       axis.title = element_text(size = rel(1.25)))  

Add Gene Labels

# Add all the gene symbols as a column from the grch38 table using bind_cols()
res_ko_wt_tb   <- bind_cols(res_ko_wt_tb,  symbol = mus38$symbol[match(res_ko_wt_tb$gene,  mus38$ensgene)])
res_uk_wt_tb   <- bind_cols(res_uk_wt_tb,  symbol = mus38$symbol[match(res_uk_wt_tb$gene,  mus38$ensgene)])
res_tissue_tb  <- bind_cols(res_tissue_tb, symbol = mus38$symbol[match(res_tissue_tb$gene, mus38$ensgene)])

# Create an empty column to indicate which genes to label
res_ko_wt_tb   <-  res_ko_wt_tb   %>%  mutate(genelabels = "") 
res_uk_wt_tb   <-  res_uk_wt_tb   %>%  mutate(genelabels = "") 
res_tissue_tb  <-  res_tissue_tb  %>%  mutate(genelabels = "") 

# Sort by padj values 
res_ko_wt_tb   <-  res_ko_wt_tb   %>%  arrange(padj)
res_uk_wt_tb   <-  res_uk_wt_tb   %>%  arrange(padj)
res_tissue_tb  <-  res_tissue_tb  %>%  arrange(padj)

# Populate the genelabels column with contents of the gene symbols column for the first 20 rows, 
# i.e. the top 20 most significantly expressed genes
res_ko_wt_tb$genelabels[1:20]   <-  as.character(res_ko_wt_tb$symbol[1:20])  
res_uk_wt_tb$genelabels[1:20]   <-  as.character(res_uk_wt_tb$symbol[1:20])  
res_tissue_tb$genelabels[1:20]  <-  as.character(res_tissue_tb$symbol[1:20])  

log2fc_wt_ko_plot   <- ggplot(res_ko_wt_tb, aes(x = log2FoldChange, y = -log10(padj))) + 
                       xlab("log2 fold change") + geom_point(aes(colour = threshold_OE)) + 
                       ylim(0,25) + ylab("-log10 adjusted p-value") + 
                       geom_text_repel(aes(label = genelabels)) + 
                       ggtitle("Log2FC Genotype (WT vs KO)") + 
                       theme(legend.position = "none", 
                       plot.title = element_text(size = rel(1.5), hjust = 0.5),
                       axis.title = element_text(size = rel(1.25))) 
log2fc_tissue_plot  <- ggplot(res_tissue_tb, aes(x = log2FoldChange, y = -log10(padj))) + 
                       xlab("log2 fold change") + geom_point(aes(colour = threshold_OE)) + 
                       ylim(0,300) + ylab("-log10 adjusted p-value") + 
                       geom_text_repel(aes(label = genelabels)) + 
                       ggtitle("Log2FC Tissue (Bone vs. Organ)") + 
                       theme(legend.position = "none", 
                       plot.title = element_text(size = rel(1.5), hjust = 0.5),
                       axis.title = element_text(size = rel(1.25))) 
log2fc_wt_ko_plot

log2fc_tissue_plot